
# First, we need to scrape our data from the retailer
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
from nltk.corpus import stopwords
# Create lists to store product descriptions, union of product descriptions, product names, product roast levels, product prices and unaltered descriptions
corpus, complete_list, prod_names, prod_roast, prod_price, ori_des = ([] for _ in range(6))
# Roast levels as classified by the retailer
roast_levels = ['light', 'medium', 'dark', 'extra-dark']
base_url = 'https://coffeecompany.com.au'
# Seconds to wait for the retailer before giving up, so a stalled connection cannot hang the notebook
request_timeout = 30
# Load the stopwords once into a set instead of calling stopwords.words() for every single word
english_stopwords = set(stopwords.words('english'))
# Compile the patterns once instead of once per page
product_link_pattern = re.compile('/collections/coffee/products')
price_class_pattern = re.compile('price-item')
for roast_level in roast_levels:
    url = base_url + '/collections/coffee/' + roast_level
    web = requests.get(url, timeout=request_timeout)
    # Stop on an HTTP error rather than parsing an error page as if it were a listing
    web.raise_for_status()
    doc = BeautifulSoup(web.text, 'html.parser')
    hrefs = doc.find_all('a', href=product_link_pattern)
    # De-duplicate the links; sorting keeps the visiting order stable between runs
    product_pages = sorted({href['href'] for href in hrefs})
    for product_page in product_pages:
        prod_name = product_page.split('/')[-1]
        url = base_url + product_page
        web = requests.get(url, timeout=request_timeout)
        web.raise_for_status()
        prod_page = BeautifulSoup(web.text, 'html.parser')
        description_tag = prod_page.find('div', class_='description')
        price_tag = prod_page.find('span', class_=price_class_pattern)
        if description_tag is None or price_tag is None:
            # Skip before appending anything so the parallel lists stay aligned
            print(f'Skipping {prod_name}: description or price not found')
            continue
        description_text = description_tag.text
        price = price_tag.text
        # The variable words stores the text in the description space, split on single
        # spaces so that ' '.join(words) reproduces the retailer's text exactly
        words = description_text.split(' ')
        # Keywords are taken from a split on any whitespace, so words separated by a
        # newline or tab are not fused together once punctuation is stripped
        processed_des = []
        for word in description_text.split():
            key = ''.join(ch for ch in word if ch.isalnum()).lower()
            # Only feed keywords into the model if they are non-empty and not a pre-defined stopword
            if key and key not in english_stopwords:
                processed_des.append(key)
        prod_names.append(prod_name)
        prod_roast.append(roast_level)
        prod_price.append(price)
        ori_des.append(words)
        # Roast level is included as part of the corpus, as it may not be brought up in the product description
        corpus.append(processed_des + [roast_level])
        complete_list.extend(processed_des + [roast_level])
%store corpus complete_list prod_names prod_roast prod_price ori_des
UsageError: Unknown variable 'corpus,'
%store -r corpus complete_list prod_names prod_roast prod_price ori_des
# Preprocessing is needed as TSNE can only recognize numeric inputs
from nltk.stem import PorterStemmer
# Linguistically, words like balance and balanced mean the same thing.
# Stemming helps the computer recognise that two beans are both balanced even if they are described differently
ps = PorterStemmer()
complete_list_stemmed = [ps.stem(word) for word in complete_list]
# Unique stemmed keywords; sorted so the column order of the matrix is the same on every run
# (plain set iteration order for strings can change between interpreter sessions)
comment_dedup = sorted(set(complete_list_stemmed))
# Create a dictionary that stores the assigned index for keywords
comment_idx = {keyword: idx for idx, keyword in enumerate(comment_dedup)}
# Do the same for corpus
corpus_stemmed = [[ps.stem(keyword) for keyword in des] for des in corpus]
# One set of keywords per product, so each membership test below is a hash lookup
# rather than a scan through the product's keyword list
corpus_keyword_sets = [set(des) for des in corpus_stemmed]
# Create a numpy array that shows 1 if a keyword is present and 0 otherwise
# Rows are products, columns are the unique keywords present in the pool of descriptions
comment_dtm = np.array([[1 if keyword in product_keywords else 0
                         for keyword in comment_dedup]
                        for product_keywords in corpus_keyword_sets])
# Model fitting. While a 3d plot is pretty cool, it is very hard to judge which beans are actually similar visually
from sklearn.manifold import TSNE
# Keep the t-SNE settings together so they are easy to tweak in one place
tsne_settings = {
    'n_components': 2,
    'perplexity': 5,
    'init': 'random',
    'learning_rate': 'auto',
    'early_exaggeration': 30,
}
model = TSNE(**tsne_settings)
tsne_coordinates = model.fit_transform(comment_dtm)
# Assemble one row per product: scraped metadata, the two embedding columns,
# and a positional index that is used later to look up the original description
product_columns = {
    'prod_names': prod_names,
    'roast_level': prod_roast,
    'price': prod_price,
    't-snex': tsne_coordinates[:, 0],
    't-sney': tsne_coordinates[:, 1],
    'idx': range(len(prod_names)),
}
df_coffee = pd.DataFrame(product_columns)
display(df_coffee.head())
| | prod_names | roast_level | price | t-snex | t-sney | idx |
|---|---|---|---|---|---|---|
| 0 | brazil-santos_5 | light | $34.00/Kg | 129.260040 | -9.927489 | 0 |
| 1 | mara-deluxe | light | $36.00/Kg | 37.924259 | 25.924942 | 1 |
| 2 | colombia | light | $34.00/Kg | 13.285400 | 54.127468 | 2 |
| 3 | ethiopian-yirgacheffe | light | $36.00/Kg | 2.318302 | 57.104263 | 3 |
| 4 | royal-special | light | $34.00/Kg | 20.383957 | 52.878563 | 4 |
# Now, we need to visualize the result. Plotly is my go-to when I want to create an interactive graph
import plotly.express as px
import plotly
plotly.offline.init_notebook_mode()
# Columns shown in the tooltip when hovering over a point
hover_columns = ['prod_names', 'roast_level', 'price']
# Square canvas, one point per product, coloured by roast level
fig = px.scatter(
    df_coffee,
    x='t-snex',
    y='t-sney',
    color="roast_level",
    hover_data=hover_columns,
    title='T-SNE plot for coffee beans',
    width=800,
    height=800,
)
fig.show()
# Disable pretty print first
%pprint
Pretty printing has been turned OFF
# How can we be sure 2 beans are similar, as suggested by the plot? Let's create a function to retrieve the descriptions provided by the retailer
def _original_description(prod_name):
    """Return the unaltered retailer description for ``prod_name`` as one string.

    The product is located in ``df_coffee`` by an exact match on the
    ``prod_names`` column, and its ``idx`` value is used to index ``ori_des``.
    If several rows share the name, the first one is used.

    Raises:
        KeyError: if no row of ``df_coffee`` has this product name.
    """
    # A boolean mask is used instead of DataFrame.query with an f-string, which
    # breaks when the name contains a double quote
    matches = df_coffee.loc[df_coffee['prod_names'] == prod_name, 'idx']
    if matches.empty:
        raise KeyError(f'No product named {prod_name!r} in df_coffee')
    # Take the scalar explicitly; calling int() on a whole Series is deprecated in pandas
    return ' '.join(ori_des[int(matches.iloc[0])])


def get_des(prod_1, prod_2):
    """Print the retailer's original descriptions of two products.

    Args:
        prod_1: name of the first product, as stored in ``df_coffee['prod_names']``.
        prod_2: name of the second product, same convention.

    Raises:
        KeyError: if either product name is not present in ``df_coffee``.
    """
    print(f'Description for {prod_1}: \n' + _original_description(prod_1))
    print(' ')
    print(f'Description for {prod_2}: \n' + _original_description(prod_2))
# Example: print the retailer's descriptions of two beans for a side-by-side read
# (presumably a pair that sits close together in the plot above - confirm against the figure)
get_des('el-salvador-finca-el-molinito-rainforest-alliance','guatemala-antigua')
Description for el-salvador-finca-el-molinito-rainforest-alliance: An ethically grown high altitude coffee. Medium strong, full bodied with a delicious caramel and toffee aftertaste lingering on the palate. Description for guatemala-antigua: A genuine high altitude shade grown Antigua. It is amongst the very best of Central American coffees with deep body and Belgian Chocolate undertones.